Setup
Import multiband raster
# Import raster layer
ls <- raster::stack("D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/L8_reducers_2021_months_8-12.tif")
Select bands (predictors)
# # Subset to specific columns
# bands <- c("B_median","G_median","R_median","NIR_median","SWIR1_median","SWIR2_median","BSI_median","NDVI_median","NDMI_median")
# landsat <- subset(ls, bands)
# OR: remove only valid_pixel column
landsat <- ls[[-length(names(ls))]]
names(landsat)
## [1] "B_median" "B_mean" "B_stdDev" "G_median" "G_mean"
## [6] "G_stdDev" "R_median" "R_mean" "R_stdDev" "NIR_median"
## [11] "NIR_mean" "NIR_stdDev" "SWIR1_median" "SWIR1_mean" "SWIR1_stdDev"
## [16] "SWIR2_median" "SWIR2_mean" "SWIR2_stdDev" "BSI_median" "BSI_mean"
## [21] "BSI_stdDev" "NDVI_median" "NDVI_mean" "NDVI_stdDev" "EVI2_median"
## [26] "EVI2_mean" "EVI2_stdDev" "SAVI_median" "SAVI_mean" "SAVI_stdDev"
## [31] "MSAVI_median" "MSAVI_mean" "MSAVI_stdDev" "NDMI_median" "NDMI_mean"
## [36] "NDMI_stdDev" "NBR_median" "NBR_mean" "NBR_stdDev" "NBR2_median"
## [41] "NBR2_mean" "NBR2_stdDev" "NDWI_median" "NDWI_mean" "NDWI_stdDev"
Import training polygons
# Import dataset
samp <- shapefile("D:/Dateien/Uni/Eagle_Master/Masterthesis/Data/trainingdata/Nuichua/LC_classes_tejas.shp")
# Need to have a numeric id for each class - helps with rasterization later on.
samp@data$class_id <- as.integer(factor(samp@data$class_id))
samp@data$plot_id <- as.integer(factor(samp@data$plot_id))
samp@data$binary_id <- as.integer(factor(samp@data$binary_id))
# Set as data table
setDT(samp@data)
Define class parameters
# set class names
classes <- c("Evergreen.Forest","Dry.Forest","Costal.Shrubland","Inland.Shrubland",
"Bare.Ground","Agriculture","Water","Built.up")
# define color palette
cpal <- c("#162700","#a1a733","#5fa00a","#37590a","#f86b05","#c22341","#0500ff","#020103")
# save class definitions in df
classdf <- data.frame(classvalue = c(1,2,3,4,5,6,7,8),
classnames = classes,
color = cpal)
–> Raster and vector data have same CRS? If not, reproject training data.
Plot training data
mapview(samp, zcol = "landcover", layer.name = "Tejas landcover classes",
col.regions = list("#c22341","#f86b05","#020103","#5fa00a","#a1a733","#162700","#37590a","#0500ff"))
Rasterize training polygons
# Create raster template
template_rst <- raster(extent(landsat$B_median), # band B2 has resolution 10 m
resolution = 30,
crs = projection(landsat$B_median))
#samp_rst <- raster::rasterize(samp, template_rst, field = "class_id") #-> not working, don't know why :(
samp_rst <- raster("D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/LC_tejas_rasterized.tif") # import sample raster created in QGIS
Create spatial points from cells in training polygons
samp_dt <- as.data.table(rasterToPoints(samp_rst))
setnames(samp_dt, old = "LC_tejas_rasterized", new = "class_id")
points <- SpatialPointsDataFrame(coords = samp_dt[, .(x, y)],
data = samp_dt,
proj4string = samp_rst@crs)
mapview(points,zcol="class_id",layer.name="Point class ids")
Extract bandinfos from landsat scene at points
# dplyr syntax
dt <- terra::rast(landsat) %>%
terra::extract(y = terra::vect(points)) %>%
as.data.frame %>%
mutate(class_id = points@data$class_id) %>% # add the class names to each row
# left_join(y = unique(poly@data), by = c("class_id" = "id")) %>%
# mutate(class_id = NULL) %>% # this column is extra now, delete it
mutate(class_id = factor(class_id))
data.table::setDT(dt)
# Add landcover labels
dt <- dt %>%
mutate(landcover = case_when(
class_id == 1 ~ "Evergreen",
class_id == 2 ~ "Dry Forest",
class_id == 3 ~ "Coastal Shrubland",
class_id == 4 ~ "Inland Shrubland",
class_id == 5 ~ "Bare Ground",
class_id == 6 ~ "Agriculture",
class_id == 7 ~ "Water",
class_id == 8 ~ "Built-up")) %>%
mutate(landcover = factor(landcover))
# View the first 6 rows
head(dt)
## ID B_median B_mean B_stdDev G_median G_mean G_stdDev R_median R_mean
## 1: 1 389 389 0 727 727 0 671 671
## 2: 2 267 267 0 607 607 0 421 421
## 3: 3 271 271 0 619 619 0 424 424
## 4: 4 264 264 0 628 628 0 431 431
## 5: 5 278 278 10 643 643 4 452 452
## 6: 6 261 261 0 619 619 0 421 421
## R_stdDev NIR_median NIR_mean NIR_stdDev SWIR1_median SWIR1_mean SWIR1_stdDev
## 1: 0 3166 3166 0 2323 2323 0
## 2: 0 3585 3585 0 1734 1734 0
## 3: 0 3701 3701 0 1738 1738 0
## 4: 0 3670 3670 0 1736 1736 0
## 5: 3 3498 3498 129 1643 1643 73
## 6: 0 3657 3657 0 1672 1672 0
## SWIR2_median SWIR2_mean SWIR2_stdDev BSI_median BSI_mean BSI_stdDev
## 1: 1375 1375 0 -856 -856 0
## 2: 762 762 0 -2824 -2824 0
## 3: 756 756 0 -2951 -2951 0
## 4: 762 762 0 -2896 -2896 0
## 5: 759 759 13 -2864 -2864 22
## 6: 724 724 0 -3036 -3036 0
## NDVI_median NDVI_mean NDVI_stdDev EVI2_median EVI2_mean EVI2_stdDev
## 1: 6501 6501 0 4327 4327 0
## 2: 7898 7898 0 5421 5421 0
## 3: 7944 7944 0 5567 5567 0
## 4: 7897 7897 0 5512 5512 0
## 5: 7708 7708 61 5238 5238 167
## 6: 7934 7934 0 5516 5516 0
## SAVI_median SAVI_mean SAVI_stdDev MSAVI_median MSAVI_mean MSAVI_stdDev
## 1: 4235 4235 0 4069 4069 0
## 2: 5270 5270 0 5356 5356 0
## 3: 5386 5386 0 5512 5512 0
## 4: 5337 5337 0 5446 5446 0
## 5: 5102 5102 136 5134 5134 178
## 6: 5346 5346 0 5459 5459 0
## NDMI_median NDMI_mean NDMI_stdDev NBR_median NBR_mean NBR_stdDev NBR2_median
## 1: 1536 1536 0 3944 3944 0 2563
## 2: 3479 3479 0 6494 6494 0 3895
## 3: 3609 3609 0 6607 6607 0 3937
## 4: 3578 3578 0 6560 6560 0 3896
## 5: 3610 3610 33 6430 6430 158 3673
## 6: 3725 3725 0 6696 6696 0 3958
## NBR2_mean NBR2_stdDev NDWI_median NDWI_mean NDWI_stdDev class_id
## 1: 2563 0 -6267 -6267 0 3
## 2: 3895 0 -7104 -7104 0 3
## 3: 3937 0 -7133 -7133 0 3
## 4: 3896 0 -7077 -7077 0 3
## 5: 3673 265 -6890 -6890 82 3
## 6: 3958 0 -7105 -7105 0 3
## landcover
## 1: Coastal Shrubland
## 2: Coastal Shrubland
## 3: Coastal Shrubland
## 4: Coastal Shrubland
## 5: Coastal Shrubland
## 6: Coastal Shrubland
Histograms of predictors
dt %>%
select(-c("class_id","ID","landcover")) %>%
select(ends_with("median")) %>%
melt(measure.vars = names(.)) %>%
ggplot() +
geom_histogram(aes(value)) +
geom_vline(xintercept = 0, color = "gray70") +
facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dt %>%
select(-c("class_id","ID","landcover")) %>%
select(ends_with("mean")) %>%
melt(measure.vars = names(.)) %>%
ggplot() +
geom_histogram(aes(value)) +
geom_vline(xintercept = 0, color = "gray70") +
facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dt %>%
select(-c("class_id","ID","landcover")) %>%
select(ends_with("stdDev")) %>%
melt(measure.vars = names(.)) %>%
ggplot() +
geom_histogram(aes(value)) +
geom_vline(xintercept = 0, color = "gray70") +
facet_wrap(facets = vars(variable), ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
set.seed(321)
# A stratified random split of the data
idx_train <- createDataPartition(dt$class_id,
p = 0.7, # percentage of data as training
list = FALSE)
train <- dt[idx_train]
test <- dt[-idx_train]
train <- train %>% select(-c("ID","landcover"))
levels(train$class_id) <- classes
test <- test %>% select(-c("ID","landcover"))
levels(test$class_id) <- classes
table(train$class_id)
##
## Evergreen.Forest Dry.Forest Costal.Shrubland Inland.Shrubland
## 1592 644 1157 633
## Bare.Ground Agriculture Water Built.up
## 493 622 84 326
table(test$class_id)
##
## Evergreen.Forest Dry.Forest Costal.Shrubland Inland.Shrubland
## 681 276 495 271
## Bare.Ground Agriculture Water Built.up
## 210 266 36 139
Create cross-validation folds (splits the data into n random groups)
n_folds <- 10
set.seed(321)
folds <- createFolds(1:nrow(train), k = n_folds)
# Set the seed at each resampling iteration. Useful when running CV in parallel.
seeds <- vector(mode = "list", length = n_folds + 1) # +1 for the final model
for(i in 1:n_folds) seeds[[i]] <- sample.int(1000, n_folds)
seeds[n_folds + 1] <- sample.int(1000, 1) # seed for the final model
Set trainControl parameters
ctrl <- trainControl(summaryFunction = multiClassSummary,
method = "cv",
number = n_folds,
search = "grid",
classProbs = TRUE, # not implemented for SVM; will just get a warning
savePredictions = TRUE,
index = folds,
seeds = seeds)
Train the classifier
# Register a doParallel cluster, using 3/4 (75%) of total CPU-s
cl <- makeCluster(3/4 * detectCores())
registerDoParallel(cl)
model_rf <- caret::train(class_id ~ . , method = "rf", data = train,
importance = TRUE, # passed to randomForest()
# run CV process in parallel;
# see https://stackoverflow.com/a/44774591/5193830
allowParallel = TRUE,
tuneGrid = data.frame(mtry = c(2, 3, 4, 5, 8)),
trControl = ctrl)
stopCluster(cl); remove(cl)
# Unregister the doParallel cluster so that we can use sequential operations
# if needed; details at https://stackoverflow.com/a/25110203/5193830
registerDoSEQ()
saveRDS(model_rf, file = "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/processed/Classification/R/initial_model_rf.rds")
model_rf$times$everything # total computation time
## User System verstrichen
## 50.09 0.93 156.55
plot(model_rf, uniform=TRUE, main="Random forest")
#ggplot(model_rf) # same as above, but using the ggplot method from caret
Confusion matrix
(cm_rf <- confusionMatrix(data = predict(model_rf, newdata = test),
test$class_id))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Evergreen.Forest Dry.Forest Costal.Shrubland
## Evergreen.Forest 679 0 2
## Dry.Forest 0 266 0
## Costal.Shrubland 1 0 491
## Inland.Shrubland 1 4 1
## Bare.Ground 0 0 0
## Agriculture 0 6 1
## Water 0 0 0
## Built.up 0 0 0
## Reference
## Prediction Inland.Shrubland Bare.Ground Agriculture Water Built.up
## Evergreen.Forest 0 0 0 1 0
## Dry.Forest 4 11 19 0 5
## Costal.Shrubland 7 0 0 1 0
## Inland.Shrubland 259 1 0 0 0
## Bare.Ground 0 192 2 0 4
## Agriculture 1 4 240 1 9
## Water 0 0 0 33 0
## Built.up 0 2 5 0 121
##
## Overall Statistics
##
## Accuracy : 0.9608
## 95% CI : (0.9522, 0.9683)
## No Information Rate : 0.2869
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9524
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Evergreen.Forest Class: Dry.Forest
## Sensitivity 0.9971 0.9638
## Specificity 0.9982 0.9814
## Pos Pred Value 0.9956 0.8721
## Neg Pred Value 0.9988 0.9952
## Prevalence 0.2869 0.1163
## Detection Rate 0.2860 0.1120
## Detection Prevalence 0.2873 0.1285
## Balanced Accuracy 0.9976 0.9726
## Class: Costal.Shrubland Class: Inland.Shrubland
## Sensitivity 0.9919 0.9557
## Specificity 0.9952 0.9967
## Pos Pred Value 0.9820 0.9737
## Neg Pred Value 0.9979 0.9943
## Prevalence 0.2085 0.1142
## Detection Rate 0.2068 0.1091
## Detection Prevalence 0.2106 0.1120
## Balanced Accuracy 0.9936 0.9762
## Class: Bare.Ground Class: Agriculture Class: Water
## Sensitivity 0.91429 0.9023 0.91667
## Specificity 0.99723 0.9896 1.00000
## Pos Pred Value 0.96970 0.9160 1.00000
## Neg Pred Value 0.99173 0.9877 0.99872
## Prevalence 0.08846 0.1120 0.01516
## Detection Rate 0.08088 0.1011 0.01390
## Detection Prevalence 0.08340 0.1104 0.01390
## Balanced Accuracy 0.95576 0.9459 0.95833
## Class: Built.up
## Sensitivity 0.87050
## Specificity 0.99687
## Pos Pred Value 0.94531
## Neg Pred Value 0.99199
## Prevalence 0.05855
## Detection Rate 0.05097
## Detection Prevalence 0.05392
## Balanced Accuracy 0.93369
model_rf$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x)), importance = TRUE, allowParallel = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 4.18%
## Confusion matrix:
## Evergreen.Forest Dry.Forest Costal.Shrubland Inland.Shrubland
## Evergreen.Forest 1584 0 6 1
## Dry.Forest 0 624 0 4
## Costal.Shrubland 7 1 1142 6
## Inland.Shrubland 0 5 14 611
## Bare.Ground 0 23 1 2
## Agriculture 0 48 3 2
## Water 0 0 1 0
## Built.up 0 9 1 3
## Bare.Ground Agriculture Water Built.up class.error
## Evergreen.Forest 0 1 0 0 0.005025126
## Dry.Forest 4 12 0 0 0.031055901
## Costal.Shrubland 0 0 1 0 0.012964564
## Inland.Shrubland 0 3 0 0 0.034755134
## Bare.Ground 456 10 0 1 0.075050710
## Agriculture 4 556 0 9 0.106109325
## Water 1 4 76 2 0.095238095
## Built.up 22 21 0 270 0.171779141
caret::varImp(model_rf)$importance %>%
as.matrix %>%
plot_ly(x = colnames(.), y = rownames(.), z = ., type = "heatmap",
width = 600, height = 600)
randomForest::importance(model_rf$finalModel) %>%
.[, - which(colnames(.) %in% c("MeanDecreaseAccuracy", "MeanDecreaseGini"))] %>%
plot_ly(x = colnames(.), y = rownames(.), z = ., type = "heatmap",
width = 600, height = 600)
randomForest::varImpPlot(model_rf$finalModel)
Make prediction
system.time({
predict_rf_2 <- raster::predict(object = landsat,
model = model_rf,
type = 'prob' , index=1:8)
# predict_svm <- raster::predict(object = landsat,
# model = model_svm, type = 'raw')
# predict_nnet <- raster::predict(object = landsat,
# model = model_nnet, type = 'raw')
})
## User System verstrichen
## 244.27 10.65 312.08
Plot class probabilies
#viewRGB(brick(landsat[1:3]), r = 3, g = 2, b = 1) +
#mapView(predict_rf
#,col.regions = list("#c22341","#f86b05","#020103","#5fa00a","#a1a733","#162700","#37590a","#0500ff")
# )
# ,
# mapView(predict_svm, col.regions = cls_dt$hex),
# mapView(predict_nnet, col.regions = cls_dt$hex)
#plot(predict_rf_2)
levelplot(predict_rf_2)
for (i in 1:length(classdf$classvalue)){
mycol <- colorRampPalette(c("white", classdf$color[i]))
print(levelplot(predict_rf_2[[i]], col.regions=mycol, main=classdf$classnames[i]))
}
Combine layers to create final single layer product (–> same as discrete class layer below)
# # Option 1 with terra package
# predict_rf_2_comb <- terra::app(terra::rast(predict_rf_2)#, indices = c(1,2,3,4,5,6,7,8)
# , fun = which.max)
# str(predict_rf_2_comb)
# #levels(predict_rf_2_comb) <- as.character(classdf$classnames)
# levelplot(predict_rf_2_comb)
# Option 2 with raster package
which.max2 <- function(x, ...) which.max(x) # helper function to absorb "na.rm"
predict_rf_2_comb <- stackApply(predict_rf_2, rep(1,8), fun=which.max2, na.rm=NULL)
#predict_rf_2_comb <- as.factor(predict_rf_2_comb)
#levels(predict_rf_2_comb) <- as.character(classdf$classnames)
#predict_rf_2_comb@data@names <- c("1 = Evergreen Forest \n2 = Dry Forest \n3 = Coastal Shrubland \n4 = Inland Shrubland \n5 = Bare Ground \n6 = Agriculture \n7 = Water \n8 = Built-up")
#plot.new() # open new plot
levelplot(predict_rf_2_comb, main = "Class probabilities")
#+ layer(panel.text(predict_rf_2_comb@extent@xmin, predict_rf_2_comb@extent@ymin, text2add))
#mtext("1 = Evergreen Forest \n2 = Dry Forest \n3 = Coastal Shrubland \n4 = Inland Shrubland \n5 = Bare Ground \n6 = Agriculture \n7 = Water \n8 = Built-up", 1, line=0, adj=0, cex = 0.7)
class_df <- classdf[1:2]
data.table(class_df)
## classvalue classnames
## 1: 1 Evergreen.Forest
## 2: 2 Dry.Forest
## 3: 3 Costal.Shrubland
## 4: 4 Inland.Shrubland
## 5: 5 Bare.Ground
## 6: 6 Agriculture
## 7: 7 Water
## 8: 8 Built.up
Make prediction
system.time({
predict_rf <- raster::predict(object = landsat,
model = model_rf,
type = 'raw')
# predict_svm <- raster::predict(object = landsat,
# model = model_svm, type = 'raw')
# predict_nnet <- raster::predict(object = landsat,
# model = model_nnet, type = 'raw')
})
## User System verstrichen
## 214.72 7.49 238.11
Plot classes (–> same as combined probability layer above)
#plot
levelplot(predict_rf,col.regions=cpal,att='value')
# # Export probability rasters for each class
# for (i in 1:length(predict_rf_2)){
# raster::writeRaster(predict_rf_2[[i]], paste0( "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_",i,"class_prob.tif"))
# }
#
# # Export combined/single raster classification
# raster::writeRaster(predict_rf_2_comb, "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_prob_comb.tif")
# raster::writeRaster(predict_rf, "D:/Dateien/Uni/Eagle_Master/Masterthesis/R_Masterthesis_SBC/testing/rf_classes.tif")